# Load required packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(emmeans)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(here)
## here() starts at /Users/roundtable/Documents/github-bootstrapping
library(ltm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: msm
## Loading required package: polycor
library(glmnet)
## Loaded glmnet 4.1-4
library(ggplot2)
library(stringi)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggpattern)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
print(here())
## [1] "/Users/roundtable/Documents/github-bootstrapping"
source(here("data_analysis/multi-choice-utils.R"))
# Set up tableau color palette
tableau_palette = c("#4E79A7","#F28E2B","#E15759","#76B7B2","#59A14F","#EDC948","#B07AA1","#FF9DA7","#9C755F","#BAB0AC")
raw_experiment_data = read.csv('../experiment_data/raw_experiment_data.csv')

bad_workers = raw_experiment_data %>%
  group_by(workerId_hash) %>%
  summarize(
    n=n()
  ) %>%
  subset(n != 48)

experiment_data = raw_experiment_data %>%
  subset(!(workerId_hash %in% bad_workers$workerId_hash)) %>%
  identify_junk() %>%
  subset(is_practice == "False") %>%
  subset(passed_check == T) %>%
  mutate(
    trial_id =  paste(workerId_hash,hitId,predictor_city,sep='-'),
    year = 2023,
  ) %>%
  threshold_estimates()

bad_trials = experiment_data %>%
  group_by(trial_id) %>%
  summarize(num_temps = length(unique(temperature_estimate))) %>%
  subset(num_temps <= 2) %>%
  pull(trial_id) %>%
  unique()


experiment_data = experiment_data %>%
    mutate(
    bad_trial = trial_id %in% bad_trials,
    adjusted_temperature_estimate = case_when(
      bad_trial ~ temperature_estimate + rnorm(n(),0,0.01),
      TRUE ~ as.double(temperature_estimate)
    )
  ) %>%
  add_bootstrapping_estimates(4,F) %>%
  mutate(
      type = case_when(
        type == 'challenging' ~ 'Challenging',
        type == 'kind' ~ 'Kind',
        type == 'wicked' ~ 'Wicked'
      ),
      type = factor(type,levels=c('Kind','Challenging','Wicked'))
  )


kind_cities = c('Baltimore', 'Charlotte', 'Denver', 'Orlando', 'Portland', 'Sacramento', 'San Antonio', 'St. Louis')
challenging_cities = c('Cairo', 'Delhi', 'Lagos', 'London', 'Mexico City', 'Paris', 'Tokyo', 'Toronto')
wicked_cities = c('Auckland', 'Buenos Aires', 'Johannesburg', 'Lima', 'Luanda', 'Santiago','Sao Paulo', 'Sydney')
all_cities = c(kind_cities,challenging_cities,wicked_cities)
experiment_data$ordered_predictor_city = factor(experiment_data$predictor_city,levels=all_cities)
experiment_data$pretty_condition = ifelse(experiment_data$condition=='forced-no-model','Control','Model assistance')
# Estimate abilities from IRT model and add to experiment_data

# Prep worker data and irt data

# Add worker data to 
worker_data = read.csv("../experiment_data/worker_data.csv")

worker_data = worker_data %>%
  subset(workerId_hash %in% unique(experiment_data$workerId_hash))

irt_data = dplyr::select(worker_data,matches("correct"))
irt_data = irt_data[,order(colnames(irt_data))]
irt_data = data.frame(sapply(irt_data, \(x) +as.logical(x)))

# Convert to 0/1
#irt_model = ltm(irt_data ~ z1)
irt_model = tpm(irt_data) # Three-parameter model
# irt_model_og = ltm(irt_data ~ z1)
# Correlate scores from both models...

# og_ability = factor.scores(irt_model_og, resp.patterns = irt_data)$score.dat$z1
# new_ability =  factor.scores(irt_model, resp.patterns = irt_data)$score.dat$z1

# plot(og_ability,new_ability)

worker_data$ability = factor.scores(irt_model, resp.patterns = irt_data)$score.dat$z1
worker_data$num_correct = rowSums(irt_data)

# Merge worker data with experiment_test
experiment_data = experiment_data %>%
  merge(worker_data,by='workerId_hash',all.x=T)
# Overall error rates: bar plot
overall_errors = experiment_data %>%
  group_by(pretty_condition) %>%
  summarize(
    average_error = mean(abs_error),
    error_se = se(abs_error,n()) 
  ) %>%
  ggplot(aes(x=pretty_condition,y=average_error,fill=pretty_condition)) +
  geom_bar(stat='identity') +
  geom_errorbar(aes(ymin=average_error - error_se,ymax = average_error+error_se),
                size=0.7,width=0.4) +
  labs(
    x='Condition',
    y='Mean absolute error',
  ) + 
  scale_y_continuous(labels = function(y) paste0(y, "°"), expand = c(0, 0), limits=c(0,19)) +
  scale_fill_manual(values = tableau_palette) +
  theme_classic() +
  theme(
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5),
        axis.title.x=element_blank(),
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
overall_errors

# Look at overall error rates by type: bar plot
errors_by_type = experiment_data %>%
  group_by(pretty_condition,type) %>%
  summarize(
    average_error = mean(abs_error),
    error_se = se(abs_error,n()) 
  ) %>%
  ggplot(aes(x=type,y=average_error,fill=pretty_condition)) +
  geom_bar(stat='identity',position=position_dodge(0.9)) +
  geom_errorbar(aes(ymin=average_error - error_se,ymax = average_error+error_se),
                size=0.7,width=0.45,position=position_dodge(0.9)) +
  labs(
    x='Trial difficulty',
    y='Mean absolute error',
    fill = "Condition"
  ) +
  scale_y_continuous(labels = function(y) paste0(y, "°"),expand = c(0, 0), limits=c(0,19)) +
  scale_fill_manual(values = tableau_palette) +
  theme_classic() +
  theme(axis.title.x=element_blank())
## `summarise()` has grouped output by 'pretty_condition'. You can override using
## the `.groups` argument.
errors_by_type

city_data = experiment_data %>%
  group_by(pretty_condition,type,ordered_predictor_city) %>%
  summarize(
    average_error = mean(abs_error),
    average_error_se = se(abs_error,n())
  ) 
## `summarise()` has grouped output by 'pretty_condition', 'type'. You can
## override using the `.groups` argument.
average_error_by_city = city_data %>%
  ggplot(aes(x=pretty_condition,y=average_error,color=pretty_condition,shape=pretty_condition)) +
  geom_point(size=2) +
  geom_errorbar(aes(ymin=average_error-average_error_se,ymax=average_error+average_error_se),
                width=0.45,size=0.7) +
  facet_wrap(. ~ ordered_predictor_city, ncol=8) +
  scale_color_manual(values = tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) + 
  labs(
    y='Mean absolute error',
    color='Condition',
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position='bottom'
    )

average_error_by_city

# Spaghetti plots: All passed check
predictor_city_highs = get_city_averages(unique(subset(experiment_data,model_assistance == "True")$predictor_city))

most_popular_model_city_highs = get_most_popular_model_cities(experiment_data) %>%
  mutate(
    city = factor(predictor_city,levels=all_cities),
    most_popular_model_city = case_when(
      most_popular_model_city == 'San Francisco' ~ 'SF',
      most_popular_model_city == 'New York' ~ 'NYC',
      most_popular_model_city == 'Washington' ~ 'DC',
      most_popular_model_city == 'Los Angeles' ~ 'LA',
      most_popular_model_city == 'Philadelphia' ~ 'Philly',
      most_popular_model_city == 'Minneapolis' ~ 'MPLS',
      T ~ most_popular_model_city
    )
  )
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
## Joining, by = "city"
best_model_city_highs = get_best_model_cities(experiment_data) %>%
  mutate(
    city = factor(predictor_city,levels=all_cities),
    best_model_city = case_when(
      best_model_city == 'San Francisco' ~ 'SF',
      best_model_city == 'New York' ~ 'NYC',
      best_model_city == 'Washington' ~ 'DC',
      best_model_city == 'Los Angeles' ~ 'LA',
      best_model_city == 'Philadelphia' ~ 'Philly',
      best_model_city == 'Minneapolis' ~ 'MPLS',
      T ~ best_model_city
    )
  )
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'predictor_city'. You can override using
## the `.groups` argument.
chosen_cities_spaghetti = predictor_city_highs %>%
  mutate(city = factor(city,levels=all_cities)) %>%
  ggplot(aes(x=month,y=average_hi)) +
  geom_line(aes(colour = "Target"), size=1,alpha=0.55) + # Removed alpha=0.5 here
  geom_line(data=most_popular_model_city_highs, aes(x=month,y=model_city_modeled_high, colour = "Most chosen"), 
            size=1, alpha=0.9) + # Kept alpha=0.5 for "Most chosen"
  geom_text(data=subset(most_popular_model_city_highs,month==2),
            aes(x=7.2,y=48,label=most_popular_model_city),
            color='#E15759') +
  geom_line(data=best_model_city_highs, aes(x=month,y=best_model_predicted_hi, colour = "Lowest mean error"), 
            size=1, alpha=0.55) + # Changed alpha for "Lowest mean error" to 0.2 (or whatever value you prefer)
  geom_text(data=subset(best_model_city_highs,month==10),
            aes(x=7.2,y=38,label=best_model_city),
            color='#76B7B2')+
  scale_x_continuous(breaks=1:12) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) +
  facet_wrap(.~city,ncol=8) +
  labs(y='Average high', x='Month') + 
 scale_colour_manual(
    values = c("Target" = "black", 
               "Most chosen" = "#E15759", 
               "Lowest mean error" = "#76B7B2"),
    breaks = c("Target", "Most chosen", "Lowest mean error"), 
    guide = guide_legend(override.aes = list(alpha = 0.55)),
    name = "City type"
 ) +
  theme(
    axis.text.x = element_blank(), 
    axis.ticks.x = element_blank(),
    axis.title.x=element_blank(),
    legend.position = 'bottom'
  )


chosen_cities_spaghetti

# Plot overall bootstrapping and human errors

human_forecasts = experiment_data %>%
  transmute(
    pretty_condition ,
    error = abs_error,
    type,
    ordered_predictor_city,
    predictor_type = 'Human'
  )

bootstrap_forecasts = experiment_data %>%
  transmute(
    pretty_condition,
    error = bootstrap_error,
    type,
    ordered_predictor_city,
    predictor_type = 'Bootstrap'
  )

errors_df = rbind(human_forecasts,bootstrap_forecasts) %>%
  mutate(predictor_type = factor(predictor_type,levels=c('Human','Bootstrap')))

full_results_plot = errors_df %>%
  group_by(pretty_condition,predictor_type,type) %>%
  summarize(
    average_error = mean(error),
    error_se = se(error,n()),
    .groups = "drop"
  ) %>%
  ggplot(aes(x=pretty_condition, y=average_error, fill=pretty_condition, pattern=predictor_type)) +
  
  # Using geom_bar_pattern for patterned bars
  geom_bar_pattern(stat='identity', position=position_dodge(0.9),
                   pattern_density = 0.1,
                   pattern_spacing = 0.015,
                   pattern_fill="black",
                   pattern_angle = 45,
                   pattern_alpha = 0.5) +
  
  geom_errorbar(aes(ymin=average_error - error_se, ymax=average_error+error_se),
                size=0.7, width=0.4, position=position_dodge(0.9)) +
  labs(
    x = 'Model assistance',
    y = 'Mean absolute error',
    fill = 'Predictor type',
    pattern = "Predictor type"
  ) +
  scale_pattern_manual(values=c(Bootstrap = "stripe", Human = "none")) + 
  scale_y_continuous(labels = function(y) paste0(y, "°"),expand = c(0, 0), limits=c(0,19)) +
  scale_fill_manual(values=tableau_palette) +
  facet_grid(cols=vars(type)) +
  theme_classic() +
  theme(
    axis.title.x=element_blank(),
    strip.background = element_blank()
  )


full_results_plot

model_city_averages = unique(subset(experiment_data,model_assistance == 'True')$model_city) %>%
    get_city_averages() %>%
    get_model_predicted_highs()
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
experiment_data_2 = experiment_data %>%
  left_join(model_city_averages, by = c("model_city" = "city", "month" = "month")) %>%
  mutate(model_city_model_high = ifelse(is.na(model_city), NA, model_predicted_hi)) %>%
  mutate(
    model_city_error = abs(model_city_model_high - average_hi)
  )

grouped_data = experiment_data_2 %>%
  subset(model_assistance=='True') %>%
  group_by(type) %>%
  summarize(
    average_human_error = mean(abs_error),
    se_human_error = se(abs_error,n()),  # compute standard error for human
    average_model_error = mean(model_city_error),
    se_model_error = se(model_city_error,n()),  # compute standard error for human
  )

# Reshape data
df_long = grouped_data %>%
  pivot_longer(
    cols = c(starts_with("average"), starts_with("se")),
    names_to = c(".value", "method"),
    names_pattern = "(.+)_(.*)_error"
  )
# Change the method names to "human" and "model"
df_long$method <- ifelse(df_long$method == "human", "Human", "Chosen cities")

# Plot
humans_vs_cities = df_long %>%
  mutate(method = factor(method,levels=c('Human','Chosen cities'))) %>%
  ggplot(aes(x = type, y = average, fill = method)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_errorbar(aes(ymin=average - se,ymax = average+se),
                size=0.7,width=0.45,position=position_dodge(0.9)) +
    scale_y_continuous(labels = function(y) paste0(y, "°"),expand = c(0, 0), limits=c(0,18)) +
    scale_fill_manual(values=c("#B07AA1","#BAB0AC")) + 
    theme_classic() +
    theme(axis.title.x=element_blank()) +
    labs(y = "Mean absolute error", x = "Type", fill = "Predictor type")

humans_vs_cities

top_10 = quantile(subset(experiment_data,model_assistance=="True")$ability,0.9)

grouped_data_expert = experiment_data_2 %>%
  subset(ability > top_10) %>%
  subset(model_assistance=='True') %>%
  group_by(type) %>%
  summarize(
    average_human_error = mean(abs_error),
    se_human_error = se(abs_error,n()),  # compute standard error for human
    average_model_error = mean(model_city_error),
    se_model_error = se(model_city_error,n()),  # compute standard error for human
  )

# Reshape data
df_long_expert = grouped_data_expert %>%
  pivot_longer(
    cols = c(starts_with("average"), starts_with("se")),
    names_to = c(".value", "method"),
    names_pattern = "(.+)_(.*)_error"
  )
# Change the method names to "human" and "model"
df_long_expert$method <- ifelse(df_long_expert$method == "human", "Experts", "Chosen cities")

# Plot
experts_vs_cities = df_long_expert %>%
  mutate(method = factor(method,levels=c('Experts',"Chosen cities"))) %>%
  ggplot(aes(x = type, y = average, fill = method)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_errorbar(aes(ymin=average - se,ymax = average+se),
                size=0.7,width=0.45,position=position_dodge(0.9)) +
    scale_y_continuous(labels = function(y) paste0(y, "°"),expand = c(0, 0), limits=c(0,18)) +
    scale_fill_manual(values=c("#B07AA1","#BAB0AC")) + 
    theme_classic() +
    theme(axis.title.x=element_blank()) +
    labs(y = "Mean absolute error", x = "Type", fill = "Predictor type")

experts_vs_cities

# Plot ability on x axis, and average error on y axis
ability_plot = experiment_data %>%
  group_by(workerId_hash,pretty_condition,type) %>%
  summarize(
    ability = mean(ability),
    average_error = mean(abs_error)
  ) %>%
  ggplot(aes(x=ability,y=average_error,color=pretty_condition)) +
  geom_point(alpha=0.1) +
  scale_color_manual(values=tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°"))+
  #geom_line(alpha=0.75,size=1.75) +
  geom_smooth(method = "lm", se = FALSE,linetype = 1, size = 1.75, alpha = 0.08) +
  labs(
    x = 'Ability',
    y='Mean absolute error',
    color = 'Condition'
  ) +
  theme_classic() +
  theme(
    strip.background = element_blank()
  ) +
  facet_grid(cols=vars(type))
## `summarise()` has grouped output by 'workerId_hash', 'pretty_condition'. You
## can override using the `.groups` argument.
ability_plot
## `geom_smooth()` using formula = 'y ~ x'

# Add preceeding and trailing month
experiment_data = experiment_data %>%
  mutate(
    preceding_month = ifelse(month==1,12,month-1),
    trailing_month = ifelse(month==12,1,month+1)
  )

# Add estimate for preceeding month to experiment_data
experiment_data = experiment_data %>%
  transmute(
    trial_id,
    preceeding_estimate = temperature_estimate,
    month = preceding_month
  ) %>%
  right_join(experiment_data,by=c('trial_id','month'))

# Add estimate for trailing month to experiment_data
experiment_data = experiment_data %>%
  transmute(
    trial_id,
    trailing_estimate = temperature_estimate,
    month = trailing_month
  ) %>%
  right_join(experiment_data,by=c('trial_id','month'))

experiment_data = experiment_data %>%
  mutate(
    interpolated_estimate = round((preceeding_estimate + trailing_estimate)/2),
    interpolated_error = abs(interpolated_estimate - average_hi)
  )

grouped_data = experiment_data %>%
  group_by(pretty_condition,type) %>%
  summarize(
    average_human_error = mean(abs_error),
    average_interpolated_error = mean(interpolated_error),
    se_human_error = se(abs_error,n()),
    se_interpolated_error = se(interpolated_error,n())
  )
## `summarise()` has grouped output by 'pretty_condition'. You can override using
## the `.groups` argument.
library(stringr)

# Reshape the data to long format
long_data <- grouped_data %>%
  pivot_longer(cols = starts_with("average_"),
               names_to = "predictor_type",
               values_to = "error") %>%
  pivot_longer(cols = starts_with("se_"),
               names_to = "se_type",
               values_to = "se")

# Separate the predictor types
long_data <- long_data %>%
  mutate(predictor_type = ifelse(str_detect(predictor_type, "human"), "Human", "Interpolated"),
         se_type = ifelse(str_detect(se_type, "human"), "Human", "Interpolated"))

# Filter out rows where predictor_type and se_type don't match (to ensure correct pairs)
long_data <- long_data %>% 
  filter(predictor_type == se_type)

interpolated_errors = errors_df %>%
  subset(predictor_type!='Human') %>%
  group_by(pretty_condition,type) %>%
  summarize(error = mean(error)) %>%
  mutate(predictor_type = 'Interpolated')
## `summarise()` has grouped output by 'pretty_condition'. You can override using
## the `.groups` argument.
interpolation_plot = long_data %>%
  ggplot(aes(x=pretty_condition,
             y=error, fill=pretty_condition, pattern=predictor_type)) +
  geom_bar_pattern(stat='identity', position=position_dodge(0.9),
                   pattern_density = 0.3,
                   pattern_spacing = 0.015,
                   pattern_fill="black",
                   pattern_angle = 45,
                   pattern_alpha = 0.6) +
  geom_errorbar(aes(ymin=error - se, ymax=error+se),
                size=0.7, width=0.4, position=position_dodge(0.9)) +
  labs(
    x = 'Model assistance',
    y = 'Mean absolute error',
    fill = 'Predictor type',
    pattern = "Predictor type"
  ) +
  scale_pattern_manual(values=c(Interpolated = "circle", Human = "none")) + 
  scale_y_continuous(labels = function(y) paste0(y, "°"),expand = c(0, 0), limits=c(0,19)) +
  scale_fill_manual(values=tableau_palette) +
  facet_grid(cols=vars(type)) + 
  geom_point(data=interpolated_errors,shape=18,size=3)+
  theme_classic() +
  theme(
    axis.title.x=element_blank(),
    strip.background = element_blank()
  )

interpolation_plot

human_errors = experiment_data %>%
  group_by(type,ordered_predictor_city) %>%
  summarize(
    average_error = mean(abs_error),
    average_error_se = se(abs_error,n()),
    type = 'Human'
  )
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
bootstrap_errors = experiment_data %>%
  group_by(type,ordered_predictor_city) %>%
  summarize(
    average_error = mean(bootstrap_error),
    average_error_se = se(bootstrap_error,n()),
    type = 'Bootstrap'
  )
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
bootstrap_by_city = human_errors %>%
  rbind(bootstrap_errors) %>%
  mutate(type = factor(type,levels=c('Human', 'Bootstrap'))) %>%
  ggplot(aes(x=type,y=average_error,shape=type)) +
  geom_point(size=2) +
  geom_errorbar(aes(ymin=average_error-average_error_se,ymax=average_error+average_error_se),
                width=0.45,size=0.7) +
  facet_wrap(. ~ ordered_predictor_city, ncol=8) +
  scale_color_manual(values = tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) + 
  labs(
    y='Mean absolute error',
    color='Condition',
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position='bottom'
    )

average_error_by_city

city_data = experiment_data %>%
  group_by(pretty_condition,type,ordered_predictor_city) %>%
  summarize(
    average_error = mean(bootstrap_error),
    average_error_se = se(bootstrap_error,n())
  ) 
## `summarise()` has grouped output by 'pretty_condition', 'type'. You can
## override using the `.groups` argument.
average_bootstrap_error_by_city = city_data %>%
  ggplot(aes(x=pretty_condition,y=average_error,color=pretty_condition,shape=pretty_condition)) +
  geom_point(size=2) +
  geom_errorbar(aes(ymin=average_error-average_error_se,ymax=average_error+average_error_se),
                width=0.45,size=0.7) +
  facet_wrap(. ~ ordered_predictor_city, ncol=8) +
  scale_color_manual(values = tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) + 
  labs(
    y='Mean absolute error',
    color='Condition',
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position='bottom'
    )

average_bootstrap_error_by_city